package com.chance.cc.crawler.prod.command.job.domain.vm.xcar.Geely;

import com.alibaba.fastjson.JSON;
import com.chance.cc.crawler.core.CrawlerEnum;
import com.chance.cc.crawler.core.CrawlerJob;
import com.chance.cc.crawler.core.downloader.HttpPage;
import com.chance.cc.crawler.core.filter.FilterUtils;
import com.chance.cc.crawler.core.record.CrawlerRecord;
import com.chance.cc.crawler.core.record.CrawlerRequestRecord;
import com.chance.cc.crawler.meta.core.bean.CrawlerMetaConstant;
import com.chance.cc.crawler.meta.core.bean.job.CrawlerScheduleJob;
import com.chance.cc.crawler.prod.command.job.domain.vm.xcar.XCarAutoCommonCrawlerSchedulerJob;
import org.apache.commons.lang3.StringUtils;

import static com.chance.cc.crawler.core.CrawlerEnum.CrawlerRequestType.*;
import static com.chance.cc.crawler.development.scripts.allfeild.AICCommonField.Tag_Site_Info;

/**
 * created by CC on 2020/11/26
 * mail 279020185@qq.com
 */
public class XcarForumTopicGeelyTraceCrawlerScheduleJob extends XCarAutoCommonCrawlerSchedulerJob {

    public static String site = "forum";
    public static String meta_site = "geely";
    public static String site_info = "geely";

    public static void main(String[] args) {
        publishXcarSchedulerJob();
    }

    public static CrawlerJob publishXcarSchedulerJob(){
        CrawlerJob crawlerJob = xcarCwalerJob();


        //发布定时采集作业
        CrawlerScheduleJob crawlerScheduleJob = new CrawlerScheduleJob();
        crawlerScheduleJob.setDomain(domain);
        crawlerScheduleJob.setCrawlerJob(JSON.toJSONString(crawlerJob));
        crawlerScheduleJob.setJobType(CrawlerMetaConstant.ScheduleCrawlerJobType.crawler.enumVal());
        crawlerScheduleJob.setNote("爱卡汽车论坛发帖追溯采集");
        crawlerScheduleJob.setCrawlerKey(crawlerJob.generateCrawlerKey());
        HttpPage page = metaServiceCommand.addOrUpdateCrawlerScheduleJob(crawlerScheduleJob);
        System.out.println("发布作业：" + page.getRawText());

        return crawlerJob;
    }

    /**
     * taobao采集
     * @return
     */
    public static CrawlerJob xcarCwalerJob() {

        CrawlerRequestRecord requestRecord = CrawlerRequestRecord.builder()
                .startPageRequest(domain, CrawlerEnum.CrawlerRequestType.turnPage)
                .httpUrl("https://www.xcar.com.cn/bbs/xbbsapi/forumdisplay/get_thread_list.php")
                .httpHead("user-agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36")
                .releaseTime(System.currentTimeMillis())
                .needWashed(false)
                .needParsed(false)
                .filter(CrawlerEnum.CrawlerRecordFilter.dateRange)
                .addFilterInfo(FilterUtils.dateRangeFilterInfo(24*7+4,null))
//                .proxy(proxy)
                .build();
        requestRecord.setDownload(false);
        requestRecord.setSkipPipeline(true);
        requestRecord.tagsCreator().bizTags().addSite(site);
        requestRecord.tagsCreator().bizTags().addCustomKV(Tag_Site_Info,site_info);

        CrawlerRecord filterCrawlerRecord = new CrawlerRecord();
        filterCrawlerRecord.setFilter(CrawlerEnum.CrawlerRecordFilter.dateRange);
        filterCrawlerRecord.addFilterInfo(FilterUtils.dateRangeFilterInfo(24+4,null));
        requestRecord.tagsCreator().bizTags().addCustomKV("comment_record_filter_info", JSON.toJSONString(filterCrawlerRecord));


        //初始 comment crawler request record
        String requestQueueName = StringUtils.joinWith("-","crawler",domain,site,site_info,"queue");
        CrawlerRequestRecord initNewsCrawlerRecord = CrawlerRequestRecord.builder()
                .startPageRequest("xca",turnPageItem)
                .httpUrl(metaServiceHttpPrefix + "/v1/meta/xcar/keys?site="+meta_site)
                .requestLabelTag(supportSource)
                .requestLabelTag(internalDownload)
                .build();

        CrawlerJob crawlerJob = CrawlerJob.builder()
                .crawlerJobThreadNumber(10)
                .triggerInfo(
                        domain,
                        CrawlerMetaConstant.ScheduleJobTrigger_Cron,
                        System.currentTimeMillis(),
                        StringUtils.joinWith("-",domain,site,site_info, CrawlerMetaConstant.ScheduleJobTriggerJob_Realtime))
                .crawlerRequestQueue(CrawlerMetaConstant.redisRequestQueue(requestQueueName))
                .fileResultPipeline(null, "/data/chance_crawler_runner/logs/node/xcar/xcar_forum_jeely_trace.log", false)
                .kafkaResultPipeline(null, kafkaTopciForTraceJob, null)
                .requestRecord(requestRecord)
                .supportRecord(initNewsCrawlerRecord)
                .build();
        crawlerJob.getScheduleTags().getCategoryTag().addKVTag("order","dateline");
        crawlerJob.getScheduleTags().getCategoryTag().addLabelTag(CrawlerEnum.CrawlerDataType.comment.enumVal());
        return crawlerJob;
    }

}
